##### 0.1 Preliminaries: setting working directory #######################################################################
## define working directory on each computer
wd_laptop <- "/Users/mark.hebblewhite/Dropbox/WILD 562/Spring2017/lab6/"
wd_desktop <- "/Users/mark.hebblewhite/Documents/Teaching/UofMcourses/WILD562/Spring2017/Labs/lab6"
## automatically set working directory depending which computer you're on
## FIX: use if/else rather than ifelse() -- ifelse() is the vectorized
## element-wise selector and should not be used for a scalar condition whose
## branches are executed purely for their side effects.
if (file.exists(wd_laptop)) {
  setwd(wd_laptop)
} else {
  setwd(wd_desktop)
}
## [1] "/Users/mark.hebblewhite/Dropbox/WILD 562/Spring2017/lab6"
getwd()
## [1] "/Users/mark.hebblewhite/Dropbox/WILD 562/Spring2017/lab6"
list.files() ## handy command to see what is inside the working directory
## [1] "clumpydata.pdf" "homerangeALL.dbf"
## [3] "homerangeALL.prj" "homerangeALL.shp"
## [5] "homerangeALL.shx" "kxv.pdf"
## [7] "kxv.R" "lab6.html"
## [9] "lab6.R" "lab6.rmd"
## [11] "Lec_12_Evaluating RSFs.pptx" "pronghorn.csv"
## [13] "rasters" "rasters.zip"
## [15] "readings" "sampleK-folds_spreadsheet.xls"
## [17] "WILD562_Lab6_Evaluating RSF.docx" "wolfkde.csv"
## [19] "wolfmcp.csv"
## Read the wolf used/available dataset and drop rows with any missing covariate.
wolfkde2 <- read.csv("wolfkde.csv", header=TRUE, sep = ",", na.strings="NA", dec=".")
wolfkde3 <- na.omit(wolfkde2)
wolfkde3$usedFactor <- as.factor(wolfkde3$usedFactor) ## make sure usedFactor is a factor
# head(wolfkde3)
length(wolfkde3$used) ## sample size remaining after dropping NAs
## [1] 2118
#source("/Users/mark.hebblewhite/Dropbox/WILD 562/Spring2017/lab4/new/Lab2NeededforLab5.R", verbose = FALSE)
# plot(homerangeALL)
#writeOGR(homerangeALL, dsn=wd_laptop, layer = "homerangeALL", driver = "ESRI Shapefile", overwrite_layer=TRUE)
## Read the home-range shapefile from the CURRENT working directory (chosen
## above). FIX: the original hard-coded dsn=wd_laptop, which fails on the
## desktop machine despite the dual-directory setup.
kernelHR <- readOGR(dsn=getwd(), "homerangeALL")
## OGR data source with driver: ESRI Shapefile
## Source: "/Users/mark.hebblewhite/Dropbox/WILD 562/Spring2017/lab6/", layer: "homerangeALL"
## with 2 features
## It has 2 fields
plot(kernelHR)
extent(kernelHR)
## class : Extent
## xmin : 546836.6
## xmax : 612093.9
## ymin : 5662036
## ymax : 5748911
## Empty raster used later only as a plotting extent (to crop maps to the home ranges).
kernels <- raster()
## NOTE(review): these are the kernelHR extent values truncated to whole metres;
## extent(kernels) <- extent(kernelHR) would avoid hand-copying them -- confirm intent.
extent(kernels) <- c(xmin=546836, xmax=612093, ymin=5662036, ymax=5748911)
## Move into the rasters/ subfolder of whichever working directory was chosen
## at the top of the script. FIX: a relative path replaces the hard-coded
## laptop path so the script also runs on the desktop machine.
setwd("rasters")
#list.files()
## Prey-species habitat suitability (HSI) layers.
deer_w<-raster("deer_w2.tif")
moose_w<-raster("moose_w2.tif")
elk_w<-raster("elk_w2.tif") # already brought in above
sheep_w<-raster("sheep_w2.tif")
goat_w<-raster("goat_w2.tif")
wolf_w<-raster("wolf_w2.tif")
elevation2<-raster("Elevation2.tif") #resampled
disthumanaccess2<-raster("DistFromHumanAccess2.tif") #resampled in lab 4
disthhu2<-raster("DistFromHighHumanAccess2.tif") #resampled in lab 4
landcover2 <- raster("landcover2.tif") ## resampled to same extent as lab 4
## but note we need to repopulate the fields with the habitat legend information
## NOTE(review): writing to the @data@values slot directly is fragile; the
## supported idiom is values(landcover2) <- getValues(landcover2) -- confirm
## before changing, since downstream code also indexes this slot.
landcover2@data@values <- getValues(landcover2)
## note that the extents are all different for human access and elc_habitat-derived layers, so need to recreate a new extent
# Build an empty template raster, then fill it with zeros; one zero-filled copy
# per landcover class becomes a 0/1 "dummy" indicator raster below.
template <- raster()
# Extent customized so it covers both the elc_habitat and humanaccess layers.
extent(template) <- c(xmin = 443680.6, xmax = 650430.4, ymin = 5618405, ymax = 5789236)
# 30-m cells to match the landcover grids.
res(template) <- 30
# Same projection as the elc_habitat shapefile (UTM zone 11, NAD83).
projection(template) <- "+proj=utm +zone=11 +datum=NAD83 +units=m +no_defs +ellps=GRS80 +towgs84=0,0,0"
# Zero everywhere; the per-class assignments below overwrite these values.
values(template) <- 0
# Duplicate the template into one 'dummy' raster per habitat class.
alpine      <- template
burn        <- template
closedConif <- template
herb        <- template
mixed       <- template
rockIce     <- template
water       <- template
modConif    <- template
decid       <- template
#set values for empty rasters based on landcover2 values of Habitat classification variables
## Each line builds a 0/1 indicator raster for one landcover class, keeping NA
## wherever landcover2 itself is NA.
alpine@data@values <- ifelse(landcover2@data@values== 15 | landcover2@data@values == 16, 1, ifelse(is.na(landcover2@data@values)==T,NA,0))
burn@data@values <- ifelse(landcover2@data@values == 12 | landcover2@data@values == 13 | landcover2@data@values == 14, 1, ifelse(is.na(landcover2@data@values)==T,NA,0))
closedConif@data@values <- ifelse(landcover2@data@values == 3, 1, ifelse(is.na(landcover2@data@values)==T,NA,0))
herb@data@values <- ifelse(landcover2@data@values == 7, 1, ifelse(is.na(landcover2@data@values)==T,NA,0))
mixed@data@values <- ifelse(landcover2@data@values == 5, 1, ifelse(is.na(landcover2@data@values)==T,NA,0))
rockIce@data@values <- ifelse(landcover2@data@values == 10, 1, ifelse(is.na(landcover2@data@values)==T,NA,0))
water@data@values <- ifelse(landcover2@data@values == 9, 1, ifelse(is.na(landcover2@data@values)==T,NA,0))
modConif@data@values <- ifelse(landcover2@data@values == 2, 1, ifelse(is.na(landcover2@data@values)==T,NA,0))
## NOTE(review): decid tests landcover2 == 10, the SAME code used for rockIce
## two lines above -- this looks like a copy-paste bug (decid would duplicate
## rockIce). Deciduous almost certainly has its own legend code; confirm
## against the landcover legend and correct.
decid@data@values <- ifelse(landcover2@data@values == 10, 1, ifelse(is.na(landcover2@data@values)==T,NA,0))
## Visual sanity checks: indicator rasters overlaid with the home-range polygons.
plot(rockIce)
plot(kernelHR, add=TRUE)
plot(closedConif)
plot(kernelHR, add=TRUE)
# note that open conifer as intercept
#stack raster layers (i.e., create raster stack for sampling; must have same extent and resolution)
all_rasters<-stack(deer_w, moose_w, elk_w, sheep_w, goat_w, wolf_w,elevation2, disthumanaccess2, disthhu2, landcover2, alpine, burn, closedConif, modConif, herb, mixed, rockIce, water, decid)
##### Running the 'top' models from last week
## top Biotic model was model 41
## Logistic regression of wolf used (1) vs available (0) locations on distance
## to human access and the deer and goat habitat-suitability covariates.
top.biotic <- glm(used ~ DistFromHumanAccess2+deer_w2 + goat_w2, family=binomial(logit), data=wolfkde3)
summary(top.biotic)
##
## Call:
## glm(formula = used ~ DistFromHumanAccess2 + deer_w2 + goat_w2,
## family = binomial(logit), data = wolfkde3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8345 -0.6246 -0.1888 -0.0306 3.1247
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.553037 0.366553 -9.693 < 2e-16 ***
## DistFromHumanAccess2 -0.001422 0.000183 -7.770 7.86e-15 ***
## deer_w2 0.898069 0.077941 11.522 < 2e-16 ***
## goat_w2 -0.333540 0.048524 -6.874 6.26e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2040.9 on 2117 degrees of freedom
## Residual deviance: 1398.2 on 2114 degrees of freedom
## AIC: 1406.2
##
## Number of Fisher Scoring iterations: 7
#double check the VIF for each final model, just to be sure
## All VIFs are ~1.0, so collinearity is not a concern in this model.
vif(top.biotic)
## DistFromHumanAccess2 deer_w2 goat_w2
## 1.039378 1.020137 1.022622
# Environmental Model - top model is model 11
## Logistic regression on elevation, distance to high human access, and a set
## of landcover dummy variables (open conifer etc.; the omitted classes form
## the intercept/reference category).
top.env <- glm(used ~ Elevation2 + DistFromHighHumanAccess2 + openConif+modConif+closedConif+mixed+herb+shrub+water+burn, family=binomial(logit), data=wolfkde3)
summary(top.env)
##
## Call:
## glm(formula = used ~ Elevation2 + DistFromHighHumanAccess2 +
## openConif + modConif + closedConif + mixed + herb + shrub +
## water + burn, family = binomial(logit), data = wolfkde3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0290 -0.5020 -0.1576 -0.0366 3.2732
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 9.570e+00 8.805e-01 10.869 < 2e-16 ***
## Elevation2 -6.782e-03 4.883e-04 -13.888 < 2e-16 ***
## DistFromHighHumanAccess2 1.867e-04 3.511e-05 5.317 1.05e-07 ***
## openConif 8.457e-01 4.404e-01 1.920 0.0548 .
## modConif -1.716e-02 3.836e-01 -0.045 0.9643
## closedConif -1.126e-01 3.944e-01 -0.286 0.7752
## mixed 1.325e+00 5.435e-01 2.438 0.0148 *
## herb 8.564e-01 5.525e-01 1.550 0.1212
## shrub 5.781e-01 4.486e-01 1.289 0.1974
## water 8.559e-01 6.389e-01 1.340 0.1804
## burn 1.861e+00 4.629e-01 4.021 5.80e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2040.9 on 2117 degrees of freedom
## Residual deviance: 1298.3 on 2107 degrees of freedom
## AIC: 1320.3
##
## Number of Fisher Scoring iterations: 7
## NOTE(review): modConif and closedConif VIFs (7.72, 5.32) exceed the common
## rule-of-thumb of ~5; high VIFs are expected among dummy variables of a
## single categorical landcover factor, but worth confirming.
vif(top.env)
## Elevation2 DistFromHighHumanAccess2 openConif
## 2.186429 2.026714 2.892384
## modConif closedConif mixed
## 7.720150 5.317916 1.857145
## herb shrub water
## 1.778124 2.733747 1.515043
## burn
## 2.322895
# Calculate model selection AIC table
require(AICcmodavg)
models <- list(top.biotic, top.env)
## FIX: modnames must be in the SAME order as 'models'. The original had them
## reversed, so top.biotic printed as "envfinal" and top.env as "bioticmodel"
## (the swapped K values in the original output -- "bioticmodel" with K = 11 --
## give the bug away: the 11-parameter model is the environmental one).
modnames <- c("bioticmodel", "envfinal")
aictab(models, modnames)
##
## Model selection based on AICc (with corrected labels):
##
## K AICc Delta_AICc AICcWt Cum.Wt LL
## envfinal 11 1320.41 0.00 1 1 -649.14
## bioticmodel 4 1406.25 85.85 0 1 -699.12
# First review here in Lab http://www.statmethods.net/stats/rdiagnostics.html for Linear Regression (Normal regression)
plot(top.env)
## this cycles through residual plots which look decidedly different from 'normal' residual plots
##### Saving predictions manually, an example with the environment model
wolfkde3$fitted.top.env <- fitted(top.env)
#### this is the predicted probability from the model
wolfkde3$residuals.top.env <- residuals(top.env)
## these are the deviations from the predictions for each row (data point)
wolfkde3$rstudent.top.env <- rstudent(top.env)
## This is a standardized residual - the studentized residual
wolfkde3$hatvalues.top.env <- hatvalues(top.env)
#### this is the first of the leverage statistics, the larger hat value is, the bigger the influence on the fitted value
wolfkde3$cooks.distance.top.env <- cooks.distance(top.env)
#### this is the Cooks leverage statistic, the larger hat value is, the bigger the influence on the fitted value
wolfkde3$obsNumber <- 1:nrow(wolfkde3) ## just added a row number for plotting
## Making manual residual versus predicted plots
plot(wolfkde3$fitted.top.env, wolfkde3$residuals.top.env)
ggplot(wolfkde3, aes(fitted.top.env, residuals.top.env)) + geom_point() + geom_text(aes(label = obsNumber, colour = used))
## Manual residual plot
## FIX: refer to columns by bare name inside aes(); using wolfkde3$ inside
## aes() is a ggplot2 anti-pattern that breaks facetting/grouping.
ggplot(wolfkde3, aes(residuals.top.env, cooks.distance.top.env)) + geom_point() + geom_text(aes(label = obsNumber, colour = used))
## shows us some points at high cooks values that might be having a big influence
ggplot(wolfkde3, aes(cooks.distance.top.env, hatvalues.top.env)) + geom_point() + geom_text(aes(label = obsNumber, colour = used))
## shows us some points at high cooks values that might be having a big influence
## this helps identify some locations that have high leverage that are used (1 - blue) and available (0=black) points
## e.g., datapoint 16
wolfkde3[16,]
## deer_w2 moose_w2 elk_w2 sheep_w2 goat_w2 wolf_w2 Elevation2
## 19 4 4 4 4 3 4 2086.334
## DistFromHumanAccess2 DistFromHighHumanAccess2 landcover16 EASTING
## 19 1289.253 1595.905 1 570610
## NORTHING pack used usedFactor habitatType landcov.f closedConif
## 19 5710430 Red Deer 1 1 Open Conifer Open Conifer 0
## modConif openConif decid regen mixed herb shrub water rockIce burn
## 19 0 1 0 0 0 0 0 0 0 0
## alpineHerb alpineShrub alpine fitted.top.env residuals.top.env
## 19 0 0 0 0.03119327 2.633459
## rstudent.top.env hatvalues.top.env cooks.distance.top.env obsNumber
## 19 2.654942 0.003644757 0.0103663 16
## is a red deer wolf used point at high elevation in open conifer far from human access.
## Does not seem to be a data entry error - a REAL data point
#Evaluating model fit graphically
# first lets make a plot of Y against X predictions...
plot(wolfkde3$Elevation2, wolfkde3$fitted.top.env)
##### Another way of looking at it
## NOTE(review): these scatterplot() arguments (reg.line=, spread=) are from an
## older car package API -- confirm against the installed car version.
scatterplot(fitted.top.env~Elevation2, reg.line=lm, smooth=TRUE, spread=TRUE, boxplots='xy',
span=0.5, xlab="elevation", ylab="residual", cex=1.5, cex.axis=1.4, cex.lab=1.4,
data=wolfkde3)
## FIX: base hist() has no 'scale' argument (that belongs to RcmdrMisc::Hist);
## frequency is already the default, so the stray argument is dropped.
hist(wolfkde3$fitted.top.env, breaks="Sturges", col="darkgray")
#### This plot is VERY important - it is the predicted probability of a location being a wolf used location given your top model. But how would we divide this into wolf habitat and wolf available?
ggplot(wolfkde3, aes(x=fitted.top.env, fill=usedFactor)) + geom_histogram(binwidth=0.05, position="identity", alpha=0.7) + xlab("Predicted Probability of Wolf Use") + theme(axis.title.x=element_text(size=16)) #+ facet_grid(pack ~ ., scales="free")
ggplot(wolfkde3, aes(x=fitted.top.env, y=..density.., fill=usedFactor)) + geom_histogram(binwidth=0.05, position="identity", alpha=0.7) + xlab("Predicted Probability of Wolf Use") + theme(axis.title.x=element_text(size=16)) + facet_grid(pack ~ ., scales="free")
###### But note this 'cut' point looks different for both wolf packs? This introduces the basic problem of dividing continuous predictions from logistic regression into categories of habitat (1) and available (0)
plot(top.biotic)
##### Saving predictions manually, this time for the top BIOTIC model
wolfkde3$fitted.top.biotic <- fitted(top.biotic)
#### this is the predicted probability from the model
wolfkde3$residuals.top.biotic <- residuals(top.biotic)
## these are the deviations from the predictions for each row (data point)
wolfkde3$rstudent.top.biotic <- rstudent(top.biotic)
## This is a standardized residual - the studentized residual
wolfkde3$hatvalues.top.biotic <- hatvalues(top.biotic)
#### this is the first of the leverage statistics, the larger hat value is, the bigger the influence on the fitted value
wolfkde3$cooks.distance.top.biotic <- cooks.distance(top.biotic)
#### This is the Cooks leverage statistic
## Making manual residual versus predicted plots
plot(wolfkde3$fitted.top.biotic, wolfkde3$residuals.top.biotic)
## FIX: bare column names inside aes() (wolfkde3$ inside aes() is an anti-pattern).
ggplot(wolfkde3, aes(cooks.distance.top.biotic, hatvalues.top.biotic)) + geom_point() + geom_text(aes(label = obsNumber, colour = used))
## this helps identify some locations that have high leverage that are used (1 - blue) and available (0=black) points
## e.g., datapoint 13
wolfkde3[13,]
## deer_w2 moose_w2 elk_w2 sheep_w2 goat_w2 wolf_w2 Elevation2
## 14 4 4 4 3 3 4 2007.453
## DistFromHumanAccess2 DistFromHighHumanAccess2 landcover16 EASTING
## 14 2105.2 9342.662 2 575450
## NORTHING pack used usedFactor habitatType landcov.f
## 14 5717684 Red Deer 1 1 Moderate Conifer Moderate Conifer
## closedConif modConif openConif decid regen mixed herb shrub water
## 14 0 1 0 0 0 0 0 0 0
## rockIce burn alpineHerb alpineShrub alpine fitted.top.env
## 14 0 0 0 0 0 0.08969027
## residuals.top.env rstudent.top.env hatvalues.top.env
## 14 2.196084 2.206977 0.004703072
## cooks.distance.top.env obsNumber fitted.top.biotic residuals.top.biotic
## 14 0.004380538 13 0.01881658 2.818871
## rstudent.top.biotic hatvalues.top.biotic cooks.distance.top.biotic
## 14 2.836193 0.00187503 0.02453511
## is a red deer wolf used point at high elevation in moderate conifer far from human access that is being classified as an AVAILABLE location.
wolfkde3[30,]
## deer_w2 moose_w2 elk_w2 sheep_w2 goat_w2 wolf_w2 Elevation2
## 35 3 4 4 4 4 4 2169.713
## DistFromHumanAccess2 DistFromHighHumanAccess2 landcover16 EASTING
## 35 1704.151 9264.565 2 580310
## NORTHING pack used usedFactor habitatType landcov.f
## 35 5708450 Red Deer 1 1 Moderate Conifer Moderate Conifer
## closedConif modConif openConif decid regen mixed herb shrub water
## 35 0 1 0 0 0 0 0 0 0
## rockIce burn alpineHerb alpineShrub alpine fitted.top.env
## 35 0 0 0 0 0 0.03129583
## residuals.top.env rstudent.top.env hatvalues.top.env
## 35 2.632212 2.644415 0.002075902
## cooks.distance.top.env obsNumber fitted.top.biotic residuals.top.biotic
## 35 0.005865753 30 0.009800049 3.041502
## rstudent.top.biotic hatvalues.top.biotic cooks.distance.top.biotic
## 35 3.053085 0.0006981523 0.01766003
## another high wolf used point that is being classified as an AVAILABLE location
#Evaluating model fit graphically
# first lets make a plot of Y against X predictions...
plot(wolfkde3$Elevation2, wolfkde3$fitted.top.biotic)
##### Another way of looking at it
scatterplot(fitted.top.biotic~Elevation2, reg.line=lm, smooth=TRUE, spread=TRUE, boxplots='xy',
span=0.5, xlab="elevation", ylab="residual", cex=1.5, cex.axis=1.4, cex.lab=1.4,
data=wolfkde3)
## FIX: base hist() has no 'scale' argument (that belongs to RcmdrMisc::Hist),
## and the call was accidentally duplicated -- one call, without 'scale', here.
hist(wolfkde3$fitted.top.biotic, breaks="Sturges", col="darkgray")
##### This plot is VERY important - it is the predicted probability of a location being a wolf used location given your top model
##### But how would we divide this into wolf habitat and wolf available?
ggplot(wolfkde3, aes(x=fitted.top.biotic, fill=usedFactor)) + geom_histogram(binwidth=0.05, position="identity", alpha=0.7) + xlab("Predicted Probability of Wolf Use") + theme(axis.title.x=element_text(size=16)) #+ facet_grid(pack ~ ., scales="free")
#### This plot shows that somewhere around 0.25 - 0.40 eyeballing it it looks like we could 'cut' used and available points?
ggplot(wolfkde3, aes(x=fitted.top.biotic, y=..density.., fill=usedFactor)) + geom_histogram(binwidth=0.05, position="identity", alpha=0.7) + xlab("Predicted Probability of Wolf Use") + theme(axis.title.x=element_text(size=16)) + facet_grid(pack ~ ., scales="free")
#### But note this 'cut' point looks different for both wolf packs?
ggplot(wolfkde3, aes(x=fitted.top.biotic, y=fitted.top.env, fill = pack)) + geom_point() + stat_smooth(method="lm")
## there is quite a bit of scatter here in the predictions between the two models, but in general, they are highly correlated
ggplot(wolfkde3, aes(x=fitted.top.biotic, y=fitted.top.env, fill = pack)) + geom_point() + stat_smooth()
## but some evidence that the top environmental model is not successfully predicting the 'best' wolf habitat especially in the bow valley wolf pack compared to the top biotic model
# First we will arbitrarily define the cutpoint between 1 and 0's using p = 0.5
ppused = wolfkde3$fitted.top.biotic>0.5
table(ppused,wolfkde3$used)
##
## ppused 0 1
## FALSE 1655 229
## TRUE 67 167
### now go through Hosmer and Lemeshow chapter 5 to calculate our classification success
## NOTE(review): the original comments in this section had sensitivity and
## specificity swapped relative to their standard definitions; the wording is
## corrected here (sensitivity = 1's classified correctly, specificity = 0's
## classified correctly). Note caret::confusionMatrix below reports with
## positive class = 0, which flips its labels relative to these.
## Sensitivity at p = 0.5: probability of classifying the 1's (used) correctly.
167/(167+229)
## [1] 0.4217172
## Specificity at p = 0.5: probability of classifying the 0's (available) correctly.
1655/(1655+67)
## [1] 0.9610918
## Now lets do this manually using cutpoints of 0.25 and 0.1
ppused = wolfkde3$fitted.top.biotic>0.25
table(ppused,wolfkde3$used)
##
## ppused 0 1
## FALSE 1376 92
## TRUE 346 304
#### Now, what is the sensitivity? (i.e., the probability of classifying the 1's correctly?)
304/(304+92)
## [1] 0.7676768
#### about 76% - Great! But - what happened to our specificity (i.e., the probability of classifying the 0's correctly?)
1376 / (1376+346)
## [1] 0.7990708
#### so our probability of classifying 0's correctly (specificity) decreases to ~ 80%
### now lets try a p = 0.10
ppused = wolfkde3$fitted.top.biotic>0.10
table(ppused,wolfkde3$used)
##
## ppused 0 1
## FALSE 1001 39
## TRUE 721 357
#### Sensitivity (probability of classifying the 1's correctly):
357/(357+39)
## [1] 0.9015152
#### about 90% - Great! But - what happened to our specificity (probability of classifying the 0's correctly)?
1001 / (1001+721)
## [1] 0.5813008
#### so our probability of classifying 0's correctly (specificity) keeps decreasing as the cutpoint is lowered
require(caret)
## 0/1 classification at the arbitrary 0.5 cutpoint, stored as a new column.
wolfkde3$pr.top.biotic.used <- ifelse(wolfkde3$fitted.top.biotic>0.5, 1, 0)
xtab1<-table(wolfkde3$pr.top.biotic.used, wolfkde3$used)
xtab1
##
## 0 1
## 0 1655 229
## 1 67 167
#?confusionMatrix
## NOTE: caret takes the first level (0) as the 'Positive' class here (see the
## last line of the output), so 'Sensitivity' below is the classification rate
## of the 0 (available) locations and 'Specificity' that of the 1 (used) ones.
confusionMatrix(xtab1)
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 1655 229
## 1 67 167
##
## Accuracy : 0.8602
## 95% CI : (0.8447, 0.8747)
## No Information Rate : 0.813
## P-Value [Acc > NIR] : 4.668e-09
##
## Kappa : 0.4544
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9611
## Specificity : 0.4217
## Pos Pred Value : 0.8785
## Neg Pred Value : 0.7137
## Prevalence : 0.8130
## Detection Rate : 0.7814
## Detection Prevalence : 0.8895
## Balanced Accuracy : 0.6914
##
## 'Positive' Class : 0
##
### This reveals a LOT of information - lets compare these values to the table in the ? confusionMatrix help file, and in the lab manual.
require(ROCR)
## Predicted probabilities from the top biotic model, fed to ROCR.
pp <- predict(top.biotic,type="response")
pred <- prediction(pp, wolfkde3$used)
perfClass <- performance(pred, "tpr","fpr") # change 2nd and/or 3rd arguments for other metrics
plot(perfClass)
#Calculate the Area Under Curve
BMauc <- performance(pred, measure="auc")
str(BMauc)
## Formal class 'performance' [package "ROCR"] with 6 slots
## ..@ x.name : chr "None"
## ..@ y.name : chr "Area under the ROC curve"
## ..@ alpha.name : chr "none"
## ..@ x.values : list()
## ..@ y.values :List of 1
## .. ..$ : num 0.868
## ..@ alpha.values: list()
auc <- as.numeric(BMauc@y.values)
auc
## [1] 0.8678605
# Find the cutoff that maximizes the sum of sensitivity and specificity
# (Youden's J statistic).
fpr <- perfClass@x.values[[1]]
tpr <- perfClass@y.values[[1]]
## FIX: named 'youden' instead of 'sum', which masked base::sum().
youden <- tpr + (1-fpr)
index <- which.max(youden)
cutoff <- perfClass@alpha.values[[1]][[index]]
cutoff
## [1] 0.2363158
## thus the cutpoint that maximizes the overall classification success is 0.236
#Plot ROC Curve with cut point and AUC
plot(perfClass, colorize = T, lwd = 5, print.cutoffs.at=seq(0,1,by=0.1),
text.adj=c(1.2,1.2),
main = "ROC Curve")
## FIX: annotate with the computed AUC rather than a hard-coded (and
## mis-rounded) "0.867".
text(0.5, 0.5, paste0("AUC = ", round(auc, 3)))
abline(v=cutoff, col = "red", lwd = 3)
### now lets try the optimized cutoff from the ROC curve above
ppused = wolfkde3$fitted.top.biotic>cutoff
table(ppused,wolfkde3$used)
##
## ppused 0 1
## FALSE 1344 76
## TRUE 378 320
#### Sensitivity (the probability of classifying the 1's correctly):
320/(320+76)
## [1] 0.8080808
#### about 80% - and our specificity (the probability of classifying the 0's correctly):
1344 / (1344+378)
## [1] 0.7804878
#### so our probability of classifying 0's correctly is about 78%
## Store the classification at the optimized cutoff and rebuild the confusion matrix.
wolfkde3$pr.top.biotic.used2 <- ifelse(wolfkde3$fitted.top.biotic>cutoff, 1, 0)
xtab2<-table(wolfkde3$pr.top.biotic.used2, wolfkde3$used)
xtab2
##
## 0 1
## 0 1344 76
## 1 378 320
#?confusionMatrix
confusionMatrix(xtab2)
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 1344 76
## 1 378 320
##
## Accuracy : 0.7856
## 95% CI : (0.7675, 0.803)
## No Information Rate : 0.813
## P-Value [Acc > NIR] : 0.9993
##
## Kappa : 0.455
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7805
## Specificity : 0.8081
## Pos Pred Value : 0.9465
## Neg Pred Value : 0.4585
## Prevalence : 0.8130
## Detection Rate : 0.6346
## Detection Prevalence : 0.6704
## Balanced Accuracy : 0.7943
##
## 'Positive' Class : 0
##
## this is our best model classifying used and avail locations into 1 and 0's.
## FIX: bare column name inside aes() (wolfkde3$ inside aes() is an anti-pattern).
ggplot(wolfkde3, aes(x=fitted.top.biotic, fill=usedFactor)) + geom_histogram(binwidth=0.05, position="identity", alpha=0.7) + xlab("Predicted Probability of Wolf Use") + theme(axis.title.x=element_text(size=16)) + geom_vline(xintercept = cutoff, col="red")
##### This graph shows the optimal cutpoint based on our data
table(wolfkde3$used)
##
## 0 1
## 1722 396
## Proportion of used points in the sample:
396/(1722+396)
## [1] 0.1869688
## note that its VERY similar to our sampling fraction - the fact that it is bigger than the basic sampling fraction is interesting, but its essentially determined by the U/(U+A)
## Load the k-fold cross-validation helper. FIX: the working directory at this
## point is the rasters/ subfolder (set earlier), and kxv.R lives one level up
## in the lab6 folder -- a relative path replaces the hard-coded laptop path
## so the script also runs on the desktop machine.
source("../kxv.R", verbose = FALSE)
## Module kxv.R loaded.
##
## Type kxvtest() to run the following example of a fixed-effects
## model with sample Elk data:
##
## elk <- read.csv(file="HR_Summer.csv", header=T)
## kxvglm( PTTYPE~DEM30M+DEM30M^2+VARCES1+FORGSUM+FORGSUM^2, elk, k=5,nbin=10)
##
## The test will do kxvPrintFlag <- TRUE in order to
## print a summary of each fitted model, but this
## flag is FALSE by default.
##
## To turn off plotting, do kxvPlotFlag <- FALSE.
# Kfolds with a 'fuzz' factor
## Globals read by kxvglm(); assigned with <- per R style.
kxvPrintFlag <- FALSE
kxvPlotFlag <- TRUE
kxvFuzzFactor <- 0.01
kfolds <- kxvglm(top.env$formula, data=wolfkde3, k=5, nbin=10)
kfolds
## r.rho p
## 1 0.9478045 3.048217e-05
## 2 0.7754764 8.393730e-03
## 3 0.9535451 1.926125e-05
## 4 0.9374369 6.212522e-05
## 5 0.9629500 7.882961e-06
## (meanfreq) 0.9665698 5.248116e-06
##### These values tell you the spearman rank correlation between every subset of the data 1 - 5 and the predicted correlation between the # of observations in ranked categories of habitat from 1, 10.
##### But note that this K-folds cross validation does not address the structure of the data within wolf packs
##### Next step is to subset by wolf pack and see how well the overall wolf model predicts wolf use by both wolf packs
# Kfolds by each pack with a 'fuzz' factor
kxvPrintFlag <- FALSE
kxvPlotFlag <- TRUE
kxvFuzzFactor <- 0.01
kfolds2 <- kxvglm(top.env$formula, data=wolfkde3, k=5, nbin=10, partition="pack")
kfolds2
## r.rho p
## Bow Valley 0.7963563 0.005836997
## Red Deer -0.2324257 0.518156318
## (meanfreq) 0.7575758 0.015920829
## NOTE: the Red Deer correlation is negative and non-significant, i.e. the
## pooled top model predicts Red Deer wolf habitat use poorly.
# Create a vector of random "folds" in this case 5, 1:5
## NOTE(review): no set.seed() precedes this, so fold membership (and the
## table printed below) changes on every run -- consider seeding for
## reproducibility.
wolfkde3$rand.vec = sample(1:5,nrow(wolfkde3),replace=TRUE)
#Run the model for 1 single random subset of the data == 1
top.env.1= glm(used ~ Elevation2 + DistFromHighHumanAccess2 + openConif+modConif+closedConif+mixed+herb+shrub+water+burn, family=binomial(logit), data=wolfkde3, subset=rand.vec==1)
# Make predictions for points not used in this random subset (2:5) to fit the model.
pred.prob = predict(top.env.1,newdata=wolfkde3[wolfkde3$rand.vec!=1,],type="response")
# Make quantiles for the predictions - this calculates the 'bin's of the categories of habitat availability
q.pp = quantile(pred.prob,probs=seq(0,1,.1))
# Then for each of 10 bins, put each row of data into a bin
bin = rep(NA,length(pred.prob))
for (i in 1:10){
bin[pred.prob>=q.pp[i]&pred.prob<q.pp[i+1]] = i
}
## FIX: the strict '<' in the loop leaves the maximum prediction (which equals
## q.pp[11], the 100% quantile of pred.prob itself) with an NA bin, silently
## dropping it from the table below; assign it to the top bin.
bin[pred.prob == q.pp[11]] = 10
## This then makes a count of just the used locations for all other K folds 2:5
used1 = wolfkde3$used[wolfkde3$rand.vec!=1]
## We then make a table of them
rand.vec.1.table <- table(used1,bin)
rand.vec.1.table
## bin
## used1 1 2 3 4 5 6 7 8 9 10
## 0 172 164 171 163 166 159 134 137 80 53
## 1 0 8 2 8 6 14 38 34 92 119
## this basically shows the data in each bin for used and available locations. If the model is any 'good', then high ranking habitat bins (7, 8, 9, 10) should have a lot more used locations in them.
## NOTE(review): the counts passed to cor.test below do NOT match the table
## printed above (they appear to come from a different random draw); the call
## only illustrates the Spearman-rank step of the k-fold validation.
cor.test(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), c(0,2,0,8,6,15,24,50,99,110), method="spearman")
##
## Spearman's rank correlation rho
##
## data: c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) and c(0, 2, 0, 8, 6, 15, 24, 50, 99, 110)
## S = 5.516, p-value = 5.248e-06
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9665698
## which suggests that in this random fold of the data, the model predicted habitat use well
par(mfrow = c(1,1))
###### 5.1 Easy spatial predictions
## Point maps of the fitted probabilities at the sampled locations themselves
## (not a full raster surface): warmer colours = higher predicted probability
## of wolf use under each top model.
ggplot(wolfkde3, aes(EASTING, NORTHING, col = fitted.top.biotic)) + geom_point(size=5) + coord_equal() + scale_colour_gradient(low = 'yellow', high = 'red')
ggplot(wolfkde3, aes(EASTING, NORTHING, col = fitted.top.env)) + geom_point(size=5) + coord_equal() + scale_colour_gradient(low = 'yellow', high = 'red')
##### What did we just do? What is this a map of?
par(mfrow = c(1,1))
## Recap of the top biotic model whose coefficients are mapped to rasters below.
summary(top.biotic)
##
## Call:
## glm(formula = used ~ DistFromHumanAccess2 + deer_w2 + goat_w2,
## family = binomial(logit), data = wolfkde3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8345 -0.6246 -0.1888 -0.0306 3.1247
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.553037 0.366553 -9.693 < 2e-16 ***
## DistFromHumanAccess2 -0.001422 0.000183 -7.770 7.86e-15 ***
## deer_w2 0.898069 0.077941 11.522 < 2e-16 ***
## goat_w2 -0.333540 0.048524 -6.874 6.26e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2040.9 on 2117 degrees of freedom
## Residual deviance: 1398.2 on 2114 degrees of freedom
## AIC: 1406.2
##
## Number of Fisher Scoring iterations: 7
#> top.biotic$coefficients
#(Intercept) DistFromHumanAccess2 deer_w2 goat_w2
#-3.553037526 -0.001421547 0.898069385 -0.333539833
## Extract the four fitted coefficients via the coef() accessor (equivalent to
## top.biotic$coefficients[1:4], but idiomatic).
biotic.coefs <- coef(top.biotic)
names(all_rasters)
## [1] "deer_w2" "moose_w2"
## [3] "elk_w2" "sheep_w2"
## [5] "goat_w2" "wolf_w2"
## [7] "Elevation2" "DistFromHumanAccess2"
## [9] "DistFromHighHumanAccess2" "landcover2"
## [11] "layer.1" "layer.2"
## [13] "layer.3" "layer.4"
## [15] "layer.5" "layer.6"
## [17] "layer.7" "layer.8"
## [19] "layer.9"
##### Note that this step takes a long time.
## Compute the linear predictor raster ONCE, then apply the inverse logit --
## the original repeated the whole expression twice, which is slow and
## error-prone. Note we use the stand-alone raster objects read in above, not
## the (renamed) layers in the raster stack.
lin.pred <- biotic.coefs[1] + biotic.coefs[2]*disthumanaccess2 + biotic.coefs[3]*deer_w + biotic.coefs[4]*goat_w
rast.top.biotic <- exp(lin.pred) / (1 + exp(lin.pred))
# plot predicted raster
plot(rast.top.biotic, col=colorRampPalette(c("yellow", "orange", "red"))(255))
plot(rast.top.biotic, col=colorRampPalette(c("yellow", "orange", "red"))(255), ext=kernels)
plot(kernelHR, add=TRUE)
#look at histogram of predicted values
#hist(rast.top.biotic@data@values) ## note this isnt working for some reason
## Relative RSF: the un-normalized exp(linear predictor), unbounded above.
rast.top.biotic.RSF <- exp(lin.pred)
## FIX: raster::plot's cropping argument is 'ext' (as used above), not
## 'extent' -- the misspelled argument was silently ignored.
plot(rast.top.biotic.RSF, col=colorRampPalette(c("yellow", "orange", "red"))(255), ext=kernels)